IndexReader.java example

Explorer
Solandra-master
/**
 * Copyright 2009 T Jake Luciani
 * 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucandra;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
import org.apache.cassandra.thrift.SuperColumn;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;

import solandra.SolandraFieldSelector;

public class IndexReader extends org.apache.lucene.index.IndexReader {

    private final static int numDocs = 1000000;
    
    private final static Directory mockDirectory = new RAMDirectory();
    static {
           
        try {
            new IndexWriter(mockDirectory, new SimpleAnalyzer(), true, MaxFieldLength.LIMITED);
        } catch (CorruptIndexException e) {
           throw new RuntimeException(e);
        } catch (LockObtainFailedException e) {
            throw new RuntimeException(e);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private final String indexName;
    private final Cassandra.Iface client;
    
    private final ThreadLocal<Map<String, Integer>> docIdToDocIndex = new ThreadLocal<Map<String, Integer>>();
    private final ThreadLocal<Map<Integer, String>> docIndexToDocId = new ThreadLocal<Map<Integer, String>>();
    private final ThreadLocal<Map<Integer, Document>> documentCache = new ThreadLocal<Map<Integer, Document>>();
    private final ThreadLocal<AtomicInteger> docCounter = new ThreadLocal<AtomicInteger>();
    private final ThreadLocal<Map<Term, LucandraTermEnum>> termEnumCache = new ThreadLocal<Map<Term, LucandraTermEnum>>();
    private final ThreadLocal<Map<String,byte[]>> fieldNorms = new ThreadLocal<Map<String, byte[]>>();
    private final static ThreadLocal<Object>  fieldCacheRefs  = new ThreadLocal<Object>();

    
    private static final Logger logger = Logger.getLogger(IndexReader.class);

    public IndexReader(String name, Cassandra.Iface client) {
        super();
        this.indexName = name;
        this.client = client; 
    }

    public synchronized IndexReader reopen() throws CorruptIndexException, IOException {

        clearCache();

        return this;
    }

    @Override
    public Object getFieldCacheKey() {
        
        Object ref = fieldCacheRefs.get();
        
        if(ref == null){           
            ref = UUID.randomUUID();
            fieldCacheRefs.set(ref);     
        }
        
        return ref;        
    }
    
    public void clearCache() {
        
        if(docCounter.get() != null) docCounter.get().set(0);
        if(docIdToDocIndex.get() != null) docIdToDocIndex.get().clear();
        if(docIndexToDocId.get() != null) docIndexToDocId.get().clear();
        if(termEnumCache.get() != null) termEnumCache.get().clear();
        if(documentCache.get() != null) documentCache.get().clear();
        if(fieldNorms.get() != null) fieldNorms.get().clear();
        if (fieldCacheRefs.get() != null)
            fieldCacheRefs.set(UUID.randomUUID());
    }
    
    protected void doClose() throws IOException {
        clearCache();
    }

    
    protected void doCommit() throws IOException {
       clearCache();
    }

    
    protected void doDelete(int arg0) throws CorruptIndexException, IOException {

    }

    
    protected void doSetNorm(int arg0, String arg1, byte arg2) throws CorruptIndexException, IOException {

    }

    
    protected void doUndeleteAll() throws CorruptIndexException, IOException {

    }

    
    public int docFreq(Term term) throws IOException {

        LucandraTermEnum termEnum = getTermEnumCache().get(term);
        if (termEnum == null) {

            long start = System.currentTimeMillis();

            termEnum = new LucandraTermEnum(this);
            termEnum.skipTo(term);

            long end = System.currentTimeMillis();

            logger.debug("docFreq() took: " + (end - start) + "ms");

            getTermEnumCache().put(term, termEnum);
        }

        return termEnum.docFreq();
    }

    
    public Document document(int docNum, FieldSelector selector) throws CorruptIndexException, IOException {

        Document doc = getDocumentCache().get(docNum);

        if (doc != null){
            logger.debug("Found doc in cache");
            return doc;
        }

        String docId = getDocIndexToDocId().get(docNum);

        if (docId == null)
            return null;

        Map<Integer, String> keyMap = new HashMap<Integer, String>();

        keyMap.put(docNum, CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + docId));

        
        List<byte[]> fieldNames = null;
        
        // Special field selector used to carry list of other docIds to cache in
        // Parallel for Solr Performance  
        if (selector != null && selector instanceof SolandraFieldSelector) {

            List<Integer> otherDocIds = ((SolandraFieldSelector) selector).getOtherDocsToCache();
            fieldNames = ((SolandraFieldSelector) selector).getFieldNames();
            
            logger.debug("Going to bulk load "+otherDocIds.size()+" documents");
            
            for (Integer otherDocNum : otherDocIds) {
                if (otherDocNum == docNum)
                    continue;

                if (getDocumentCache().containsKey(otherDocNum))
                    continue;

                String docKey = getDocIndexToDocId().get(otherDocNum);

                if (docKey == null)
                    continue;

                keyMap.put(otherDocNum, CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + docKey));
            }           
        }
        
        ColumnParent columnParent = new ColumnParent();
        columnParent.setColumn_family(CassandraUtils.docColumnFamily);

        SlicePredicate slicePredicate = new SlicePredicate();
        
        if (fieldNames == null || fieldNames.size() == 0) {
            // get all columns ( except this skips meta info )
            slicePredicate.setSlice_range(new SliceRange(new byte[] {}, CassandraUtils.finalToken.getBytes("UTF-8"), false, 100));
        } else {
            
            slicePredicate.setColumn_names(fieldNames);
        }

       
        long start = System.currentTimeMillis();

        try {
            Map<String, List<ColumnOrSuperColumn>> docMap = client.multiget_slice(CassandraUtils.keySpace, Arrays.asList(keyMap.values().toArray(
                    new String[] {})), columnParent, slicePredicate, ConsistencyLevel.ONE);

            for (Map.Entry<Integer, String> key : keyMap.entrySet()) {

                List<ColumnOrSuperColumn> cols = docMap.get(key.getValue());

                if (cols == null) {
                    logger.warn("Missing document in multiget_slice for: " + key.getValue());
                    continue;
                }

                Document cacheDoc = new Document();

                for (ColumnOrSuperColumn col : cols) {

                    Field field = null;
                    String fieldName = new String(col.column.name);

                    //Incase __META__ slips through
                    if(Arrays.equals(col.column.name,CassandraUtils.documentMetaField.getBytes())){
                        logger.debug("Filtering out __META__ key");
                        continue;
                    }
                    
                    byte[] value;

                    if (col.column.value[col.column.value.length - 1] != Byte.MAX_VALUE && col.column.value[col.column.value.length - 1] != Byte.MIN_VALUE) {
                        throw new CorruptIndexException("Lucandra field is not properly encoded: "+docId+"("+fieldName+")");
                   
                    } else if (col.column.value[col.column.value.length - 1] == Byte.MAX_VALUE) { //Binary
                        value = new byte[col.column.value.length - 1];
                        System.arraycopy(col.column.value, 0, value, 0, col.column.value.length - 1);

                        field = new Field(fieldName, value, Store.YES);
                        cacheDoc.add(field);
                    } else if (col.column.value[col.column.value.length - 1] == Byte.MIN_VALUE) { //String
                        value = new byte[col.column.value.length - 1];
                        System.arraycopy(col.column.value, 0, value, 0, col.column.value.length - 1);
                        
                        //Check for multi-fields
                        String fieldString = new String(value,"UTF-8");
                        
                        if(fieldString.indexOf(CassandraUtils.delimeter) >= 0 ){
                            StringTokenizer tok = new StringTokenizer(fieldString,CassandraUtils.delimeter); 
                            while(tok.hasMoreTokens()) {
                                field = new Field(fieldName, tok.nextToken(), Store.YES, Index.ANALYZED);
                                cacheDoc.add(field);
                            }
                        }else{
                        
                            field = new Field(fieldName, fieldString, Store.YES, Index.ANALYZED);
                            cacheDoc.add(field);
                        }
                    }

                    
                }
                
                //Mark the required doc
                if(key.getKey().equals(docNum))
                    doc = cacheDoc;
                
                getDocumentCache().put(key.getKey(),cacheDoc);
            }

            long end = System.currentTimeMillis();

            logger.debug("Document read took: " + (end - start) + "ms");

            return doc;

        } catch (Exception e) {
            throw new IOException(e.getLocalizedMessage());
        }

    }

    @Override
    public Collection getFieldNames(FieldOption fieldOption) {
        return Arrays.asList(new String[] {});
    }

    @Override
    public TermFreqVector getTermFreqVector(int docNum, String field) throws IOException {

        String docId = getDocIndexToDocId().get(docNum);

        TermFreqVector termVector = new lucandra.TermFreqVector(indexName, field, docId, client);

        return termVector;
    }

    @Override
    public void getTermFreqVector(int arg0, TermVectorMapper arg1) throws IOException {
        throw new RuntimeException();
    }

    @Override
    public void getTermFreqVector(int arg0, String arg1, TermVectorMapper arg2) throws IOException {

        throw new RuntimeException();

    }

    @Override
    public TermFreqVector[] getTermFreqVectors(int arg0) throws IOException {
        throw new RuntimeException();
    }

    @Override
    public boolean hasDeletions() {

        return false;
    }

    @Override
    public boolean isDeleted(int arg0) {

        return false;
    }

    @Override
    public int maxDoc() {
        // if (numDocs == null)
        // numDocs();

        return numDocs + 1;
    }

    @Override
    public byte[] norms(String field) throws IOException {
        return  getFieldNorms().get(field);
    }

    @Override
    public void norms(String arg0, byte[] arg1, int arg2) throws IOException {

        throw new RuntimeException("This operation is not supported");
        
    }

    @Override
    public int numDocs() {

        return numDocs;
    }

    @Override
    public TermDocs termDocs() throws IOException {
        return new LucandraTermDocs(this);
    }

    @Override
    public TermPositions termPositions() throws IOException {
        return new LucandraTermDocs(this);
    }

    @Override
    public TermEnum terms() throws IOException {
        return new LucandraTermEnum(this);
    }

    @Override
    public TermEnum terms(Term term) throws IOException {

        LucandraTermEnum termEnum = getTermEnumCache().get(term);
        
        if(termEnum == null)
            termEnum = new LucandraTermEnum(this);
        
        if( !termEnum.skipTo(term) ) //if found in the cache then reset, otherwise init.
            termEnum = null;
        

        return termEnum;
    }

    public int addDocument(SuperColumn docInfo, String field) {

        String id;
        try {
            id = new String(docInfo.name, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("Cant make docId a string");
        }

        Integer idx = getDocIdToDocIndex().get(id);

        if (idx == null) {
            idx = getDocCounter().incrementAndGet();

            if (idx > numDocs)
                throw new IllegalStateException("numDocs reached");

            getDocIdToDocIndex().put(id, idx);
            getDocIndexToDocId().put(idx, id);
            
            Byte norm = null;
            for(Column c : docInfo.columns){
                if(Arrays.equals(c.name, CassandraUtils.normsKey.getBytes())){
                    if(c.value.length != 1)
                        throw new IllegalStateException("Norm for field "+field+" must be a single byte");
                    
                    norm = c.value[0];
                }                 
            }
            
            if(norm == null)
                norm = Similarity.encodeNorm(1.0f);
            
            byte[] norms = getFieldNorms().get(field);
            
            if (norms == null) 
                norms = new byte[1024];                         

            while(norms.length <= idx && norms.length < numDocs ){
                byte[] _norms = new byte[(norms.length * 2) < numDocs ? (norms.length * 2) : (numDocs + 1)];
                System.arraycopy(norms, 0, _norms, 0, norms.length);
                norms = _norms;           
            }

            // find next empty position
            norms[idx] = norm;

            getFieldNorms().put(field, norms);            
        }

        return idx;
    }
    
    public int getDocumentNumber(byte[] docId){
        String id;
        try {
            id = new String(docId, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("Cant make docId a string");
        }
        
        return getDocIdToDocIndex().get(id);
    }
    
    public String getDocumentId(int docNum) {
        return getDocIndexToDocId().get(docNum);
    }

    public String getIndexName() {
        return indexName;
    }

    public Cassandra.Iface getClient() {
        return client;
    }

    public LucandraTermEnum checkTermCache(Term term) {
        return getTermEnumCache().get(term);
    }

    public void addTermEnumCache(Term term, LucandraTermEnum termEnum) {
        getTermEnumCache().put(term, termEnum);
    }

    @Override
    public Directory directory() {
        clearCache();
        
        return mockDirectory;
    }

    @Override
    public long getVersion() {
        return 1;
    }

    @Override
    public boolean isOptimized() {
       return true;
    }
    
    @Override
    public boolean isCurrent() {
       return true;
    }

    public Map<Integer, String> getDocIndexToDocId() {
        Map<Integer, String> c = docIndexToDocId.get();
        
        if(c == null){
            c = new HashMap<Integer,String>();
            docIndexToDocId.set(c);
        }
        
        return c;
    }
    
    private Map<String,Integer> getDocIdToDocIndex(){
        Map<String, Integer> c = docIdToDocIndex.get();
        
        if(c == null){
            c = new HashMap<String,Integer>();
            docIdToDocIndex.set(c);
        }
        
        return c;
    }
    
    private AtomicInteger getDocCounter(){
        AtomicInteger c = docCounter.get();
        
        if(c == null){
            c = new AtomicInteger(0);
            docCounter.set(c);
        }
        
        return c;
    }
    
    private Map<Term,LucandraTermEnum> getTermEnumCache(){
        Map<Term,LucandraTermEnum> c = termEnumCache.get();
    
        if(c == null){
            c = new HashMap<Term,LucandraTermEnum>();
            termEnumCache.set(c);
        }
        
        return  c;
    }
    
    private Map<Integer,Document> getDocumentCache(){
        Map<Integer,Document> c = documentCache.get();
    
        if(c == null){
            c = new HashMap<Integer,Document>();
            documentCache.set(c);
        }
        
        return c;
    }
    
    private Map<String,byte[]> getFieldNorms(){
        Map<String, byte[]> c = fieldNorms.get();
        
        if(c == null){
            c = new HashMap<String,byte[]>();
            fieldNorms.set(c);
        }
        
        return c;
    }
    
}